from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split

# Obtain the Spark session for this job, reusing an existing one if present.
spark = (
    SparkSession.builder
    .appName("TextFileWordCount")
    .getOrCreate()
)

# Use 5 shuffle partitions for the shuffles this session performs
# (e.g. the groupBy aggregation over words).
spark.conf.set("spark.sql.shuffle.partitions", 5)

# Create a streaming DataFrame over the text files in the input directory.
# The "text" source exposes each line of input as a row with a single
# string column named "value".
# The path is a raw string so that the Windows backslashes are taken
# literally rather than being parsed as (invalid) escape sequences.
lines = (
    spark.readStream
    .format("text")
    .load(r"D:\Data\Spark\TextFile")
)

# Break every line on single spaces and flatten the resulting arrays so
# that each token becomes its own row in a column named "word".
words = lines.select(explode(split(lines["value"], " ")).alias("word"))

# Running count of rows per distinct word.
wordCounts = words.groupBy("word").count()

# Start the streaming query that prints the running counts to the console.
# "complete" output mode writes the whole updated counts table on every
# trigger, which is what a running aggregation needs.
query = (
    wordCounts.writeStream
    .outputMode("complete")
    .format("console")
    .start()
)

# Block the driver until any active streaming query on this session
# terminates; without this the script would exit right after start().
spark.streams.awaitAnyTermination()